.csv of SFARI genes information downloaded from https://gene.sfari.org/database/gene-scoring/. Downloaded version 01-15-19.
.csv file for each of the SFARI genes with the information of all the reports related to it obtained from the SFARI website https://gene.sfari.org/database/human-gene/gene_name using scrapy. Date of web scraping: 20/05/19, script used: ./SFARI_scraping/SFARI_scraping/spiders/SFARI_scraper.py
For each gene, consider all reports, including the ones not related to ASD.
# Select if results should be filtered to ASD related or not
filter_ASD = FALSE
### Get list of genes and pubmed IDs
# Genes
list_gene_files = list.files('./../SFARI_scraping/genes/')
# Pubmed IDs
get_pubmed_IDs = function(autism=FALSE){
all_pubmed_ids = c()
n = 0
for(gene_file in list_gene_files){
file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
if(autism) file = file %>% filter(Autism.Report == 'Yes')
if(nrow(file)>0){
pubmed_ids = unique(file$pubmed.ID)
all_pubmed_ids = unique(c(all_pubmed_ids, pubmed_ids))
}
n = c(n, length(all_pubmed_ids))
}
all_pubmed_ids = as.character(all_pubmed_ids)
return(list('n'=n, 'all_pubmed_ids'=all_pubmed_ids))
}
get_pubmed_IDs_output = get_pubmed_IDs(filter_ASD)
n = get_pubmed_IDs_output$n
all_pubmed_ids = get_pubmed_IDs_output$all_pubmed_ids
# See change in number of pubmed articles by each new gene (Seems like the number of new articles were beginning to decrease in the end)
pubmed_counts = data.frame('genes' = seq(1,length(n)), 'articles'=n)
ggplotly(pubmed_counts %>% ggplot(aes(genes, articles,color='#434343')) + geom_line() +
geom_smooth(method='lm', color='#666666', size=0.5) + theme_minimal() + theme(legend.position='none') +
ggtitle('Increase in number of pubmed articles by each new gene'))
rm(n, get_pubmed_IDs_output)
# Create gene - pubmedID sparse matrix
SFARI_genes_info = read.csv('./../../Data/SFARI_genes_01-15-2019.csv') %>% mutate('ID'=gene.symbol)
create_gene_pmID_mat = function(autism=FALSE){
# Create empty dataframe
gene_pmID_mat = data.frame(matrix(0, nrow=length(list_gene_files), ncol=length(all_pubmed_ids)))
rownames(gene_pmID_mat) = sapply(list_gene_files, function(x) gsub('.csv', '', x))
colnames(gene_pmID_mat) = all_pubmed_ids
# Fillout dataframe
for(gene_file in list_gene_files){
file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
if(autism) file = file %>% filter(Autism.Report == 'Yes')
if(nrow(file)>0){
pubmed_ids = as.character(unique(file$pubmed.ID))
gene_pmID_mat[gsub('.csv', '', gene_file), pubmed_ids] = 1
}
}
gene_names = rownames(gene_pmID_mat)
sparse_gene_pmID_mat = Matrix(as.matrix(gene_pmID_mat), sparse=TRUE)
return(list('gene_names'=gene_names, 'sparse_gene_pmID_mat'=sparse_gene_pmID_mat))
}
gene_pmID_mat_output = create_gene_pmID_mat(filter_ASD)
gene_names = gene_pmID_mat_output$gene_names
gene_pmID_mat = gene_pmID_mat_output$sparse_gene_pmID_mat
plot_data = data.frame('ID' = rownames(gene_pmID_mat), 'n_papers'=rowSums(gene_pmID_mat)) %>%
left_join(SFARI_genes_info, by='ID') %>% mutate(gene.score=as.factor(gene.score))
bins = max(plot_data$n_papers)-min(plot_data$n_papers)+1
p1 = ggplotly(plot_data %>% ggplot(aes(n_papers)) + geom_histogram(bins=bins, alpha=0.6) +
theme_minimal() + ggtitle('Distribution of number of papers per gene'))
p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, n_papers, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() +
ggtitle('Number of papers per gene (left) and by SFARI score (right)'))
subplot(p1, p2, nrows=1)
# Remove genes with no articles related to them
if(sum(rowSums(gene_pmID_mat)==0)>0){
print(paste0('Removing ', sum(rowSums(gene_pmID_mat)==0), ' rows with zero counts from gene-pubmedID count matrix'))
gene_pmID_mat = gene_pmID_mat[rowSums(gene_pmID_mat)>0,]
}
# Nodes
nodes = SFARI_genes_info %>% dplyr::select(gene.symbol, gene.score, syndromic, number.of.reports) %>%
mutate(id = gene.symbol, title=gene.symbol, value=number.of.reports,
shape=ifelse(syndromic==0, 'circle','square'),
color=case_when(gene.score==1 ~ '#FF7631', gene.score==2 ~ '#FFB100',
gene.score==3 ~ '#E8E328', gene.score==4 ~ '#8CC83F',
gene.score==5 ~ '#62CCA6', gene.score==6 ~ '#59B9C9',
is.na(gene.score) ~ '#b3b3b3')) %>%
filter(!duplicated(id))
# Edges
edges = tibble('from'=character(), 'to'=character(), 'weight'=integer())
gene_gene_mat = gene_pmID_mat %*% t(gene_pmID_mat)
edges = summary(gene_gene_mat) %>% rename('from'=i, 'to'=j, 'weight'=x) %>%
mutate(from = gene_names[from], to = gene_names[to]) %>%
filter(from!=to)
nodes = nodes %>% filter(gene.symbol %in% unique(c(edges$from, edges$to)))
# Details:
print(paste0('Nodes: ', nrow(nodes),' Edges: ', nrow(edges)))
## [1] "Nodes: 955 Edges: 167932"
print(paste0('Lost ', length(unique(SFARI_genes_info$gene.symbol[!SFARI_genes_info$gene.symbol %in% nodes$id])),
' genes because they didn\'t share any publication with other genes'))
## [1] "Lost 98 genes because they didn't share any publication with other genes"
Lost more genes from lower scores, not a single one from score 1
table(SFARI_genes_info$gene.score[!SFARI_genes_info$gene.symbol %in% nodes$gene.symbol], useNA='ifany')
##
## 2 3 4 5 <NA>
## 1 10 33 40 15
Count of original genes by score:
table(SFARI_genes_info$gene.score, useNA='ifany')
##
## 1 2 3 4 5 6 <NA>
## 25 62 195 454 175 25 118
rm(gene_gene_mat)
The correlation between eigencentrality and SFARI scores is strong (it’s negative because the highest SFARI scores have the lowest numbers)
graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
# Eigencentrality
eigen_centrality_output = eigen_centrality(graph, weights=edges$weight)
eigencentr = eigen_centrality_output$vector
# Strong correlation (negative because 'higher' SFARI scores have actually lower numbers)
cor_SFARI_eigencentr = cor(nodes$gene.score[!is.na(nodes$gene.score)], eigencentr[!is.na(nodes$gene.score)])
plot_data = data.frame('gene.score' = as.character(nodes$gene.score), 'Eigencentrality'=eigencentr)
ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() +
ggtitle(paste0('Correlation = ', round(cor_SFARI_eigencentr, 4))))
rm(cor_SFARI_eigencentr, plot_data)
There seems to be a relation between number of papers and eigencentrality for all scores
nodes = nodes %>% mutate('eigencentrality'=eigencentr, 'n.papers'=value, 'gene.score'=as.character(gene.score))
ggplotly(nodes %>% ggplot(aes(n.papers, eigencentrality, fill=gene.score, color=gene.score)) +
geom_point(alpha=0.5, aes(id=gene.symbol)) + geom_smooth(method='lm', fill=NA, size=0.5) +
theme_minimal() + ggtitle('N papers vs Eigencentrality'))
Note: Filtered the edges with a weight of one and removed the resulting unconnected nodes to improve visualisation of the network
edges_filtered = edges %>% filter(weight>1)
nodes_filtered = nodes %>% filter(gene.symbol %in% unique(c(edges_filtered$from, edges_filtered$to)))
visNetwork(nodes_filtered, edges_filtered, height = '700px', width='100%') %>%
visOptions(selectedBy='gene.score') %>% visEdges(smooth=TRUE) %>%
visIgraphLayout() %>% visLayout(randomSeed=12)
rm(edges_filtered, nodes_filtered)